{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import statsmodels.api as sm\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "np.random.seed(4873)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('./data/simple_example.csv')\n",
    "Y = data[['y']]\n",
    "X = data[['x']]\n",
    "# 加入新的随机变量，此变量的系数应为0\n",
    "X['z'] = np.random.randint(2, size=20)\n",
    "# 加入常量变量\n",
    "X = sm.add_constant(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 构建并训练模型\n",
    "model = sm.OLS(Y, X)\n",
    "re = model.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.963\n",
      "Model:                            OLS   Adj. R-squared:                  0.959\n",
      "Method:                 Least Squares   F-statistic:                     222.8\n",
      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):           6.38e-13\n",
      "Time:                        10:39:38   Log-Likelihood:                -31.141\n",
      "No. Observations:                  20   AIC:                             68.28\n",
      "Df Residuals:                      17   BIC:                             71.27\n",
      "Df Model:                           2                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const         -0.8983      0.953     -0.942      0.359      -2.910       1.113\n",
      "x              1.0400      0.050     20.722      0.000       0.934       1.146\n",
      "z             -0.3619      0.571     -0.634      0.535      -1.566       0.843\n",
      "==============================================================================\n",
      "Omnibus:                        1.295   Durbin-Watson:                   2.209\n",
      "Prob(Omnibus):                  0.523   Jarque-Bera (JB):                0.852\n",
      "Skew:                          -0.061   Prob(JB):                        0.653\n",
      "Kurtosis:                       1.996   Cond. No.                         66.7\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# 整体统计分析结果\n",
    "print(re.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 生成模型数据\n",
    "np.random.seed(5320)\n",
    "x = np.array(range(0, 20)) / 2\n",
    "error = np.round(np.random.randn(20), 2)\n",
    "y = 0.05 * x + error\n",
    "# 新加入的无关变量z恒等于1\n",
    "z = np.zeros(20) + 1\n",
    "data = pd.DataFrame({\"x\": x, \"z\": z, \"y\": y})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                 OLS Regression Results                                \n",
      "=======================================================================================\n",
      "Dep. Variable:                      y   R-squared (uncentered):                   0.204\n",
      "Model:                            OLS   Adj. R-squared (uncentered):              0.162\n",
      "Method:                 Least Squares   F-statistic:                              4.878\n",
      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):                      0.0397\n",
      "Time:                        10:39:38   Log-Likelihood:                         -29.583\n",
      "No. Observations:                  20   AIC:                                      61.17\n",
      "Df Residuals:                      19   BIC:                                      62.16\n",
      "Df Model:                           1                                                  \n",
      "Covariance Type:            nonrobust                                                  \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "x              0.0969      0.044      2.209      0.040       0.005       0.189\n",
      "==============================================================================\n",
      "Omnibus:                        0.871   Durbin-Watson:                   2.037\n",
      "Prob(Omnibus):                  0.647   Jarque-Bera (JB):                0.815\n",
      "Skew:                           0.275   Prob(JB):                        0.665\n",
      "Kurtosis:                       2.179   Cond. No.                         1.00\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] R² is computed without centering (uncentered) since the model does not contain a constant.\n",
      "[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# 没有多余变量时，x系数符号估计正确，为正数\n",
    "model = sm.OLS(data[['y']], data[['x']])\n",
    "re = model.fit()\n",
    "print(re.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            OLS Regression Results                            \n",
      "==============================================================================\n",
      "Dep. Variable:                      y   R-squared:                       0.005\n",
      "Model:                            OLS   Adj. R-squared:                 -0.050\n",
      "Method:                 Least Squares   F-statistic:                   0.09171\n",
      "Date:                Fri, 08 Dec 2023   Prob (F-statistic):              0.765\n",
      "Time:                        10:39:38   Log-Likelihood:                -27.982\n",
      "No. Observations:                  20   AIC:                             59.96\n",
      "Df Residuals:                      18   BIC:                             61.96\n",
      "Df Model:                           1                                         \n",
      "Covariance Type:            nonrobust                                         \n",
      "==============================================================================\n",
      "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "x             -0.0243      0.080     -0.303      0.765      -0.193       0.144\n",
      "z              0.7873      0.445      1.768      0.094      -0.148       1.723\n",
      "==============================================================================\n",
      "Omnibus:                        0.939   Durbin-Watson:                   2.375\n",
      "Prob(Omnibus):                  0.625   Jarque-Bera (JB):                0.886\n",
      "Skew:                           0.338   Prob(JB):                        0.642\n",
      "Kurtosis:                       2.221   Cond. No.                         11.0\n",
      "==============================================================================\n",
      "\n",
      "Notes:\n",
      "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n"
     ]
    }
   ],
   "source": [
    "# 加入多余变量时，x系数符号估计错误，为负数\n",
    "model1 = sm.OLS(data[['y']], data[['x', 'z']])\n",
    "re1 = model1.fit()\n",
    "print(re1.summary())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
